import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint
from subprocess import call

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.tree import export_graphviz
from sklearn import tree, ensemble, metrics

import graphviz
from IPython.display import Image
# Load the merged dataset and drop the index column left over from an earlier to_csv.
df = pd.read_csv("merged_dataset.csv", sep=',')
df.drop('Unnamed: 0', axis=1, inplace=True)
# Hold out 15% of the rows for testing.
df_train, df_test = train_test_split(df, test_size=0.15)
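A small variation, purely our suggestion and not part of the original run: fixing random_state makes the split reproducible, and stratifying on indicator keeps the class ratio the same in both parts. The variable names below are hypothetical, so the original split above stays in effect.

# Hypothetical alternative split: reproducible and stratified on the target.
# random_state=42 is an arbitrary choice, not used in the outputs below.
df_train_strat, df_test_strat = train_test_split(
    df, test_size=0.15, random_state=42, stratify=df['indicator'])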
An algorithm using a single rule (i.e., one column). We chose hbver because it came out as the best attribute in the previous assignment. The rule is: if hbver is greater than 0.5, the patient is probably healthy. If we look at the boxplot, we can roughly see this threshold there.
$Accuracy = \frac{TP+TN}{TP+FP+FN+TN}$
$Precision = \frac{TP}{TP+FP}$
$Recall = \frac{TP}{TP+FN}$
datas_from_classification = dict()

def print_classf_data(key):
    """Print the stored confusion counts for `key` and derive the basic metrics."""
    global datas_from_classification
    d = datas_from_classification[key]
    print("Name: Values:")
    print()
    print("Total: ", d["ALL"])
    print("TP: ", d["TP"])
    print("TN: ", d["TN"])
    print("FP: ", d["FP"])
    print("FN: ", d["FN"])
    # Rows are the true label, columns the predicted label.
    mat = [[d["TN"], d["FP"]], [d["FN"], d["TP"]]]
    sns.heatmap(mat, square=True, annot=True, fmt='d', cbar=False)
    plt.xlabel('predicted label')
    plt.ylabel('true label');
    accuracy = (d["TP"] + d["TN"]) / d["ALL"]
    precision = d["TP"] / (d["FP"] + d["TP"])
    recall = d["TP"] / (d["FN"] + d["TP"])
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print()
sns.boxplot(y="hbver", x="indicator", data=df)
plt.show()
def algorithm1R(df, key):
    """One-rule classifier: predict healthy (indicator 0) when hbver > 0.5."""
    global datas_from_classification
    datas_from_classification[key] = dict()
    d = datas_from_classification[key]
    d["ALL"] = len(df)
    # Note: indicator 0.0 (healthy) is treated as the positive class here.
    # (<= so that a value exactly at the threshold is still counted.)
    d["TP"] = len(df.loc[(df.hbver > 0.5) & (df['indicator'] == 0.0)])
    d["TN"] = len(df.loc[(df.hbver <= 0.5) & (df['indicator'] == 1.0)])
    d["FP"] = len(df.loc[(df.hbver > 0.5) & (df['indicator'] == 1.0)])
    d["FN"] = len(df.loc[(df.hbver <= 0.5) & (df['indicator'] == 0.0)])
    print_classf_data(key)
    d["VALS"] = np.where(df.hbver > 0.5, 0.0, 1.0)
    d["CR"] = classification_report(df['indicator'], d["VALS"], target_names=["0", "1"])
    print(d["CR"])
algorithm1R(df_train,"train_1R")
Name: Values:
Total: 7058
TP: 1872
TN: 2945
FP: 1508
FN: 733
Accuracy: 0.6824879569283083
Precision: 0.5538461538461539
Recall: 0.7186180422264875
              precision    recall  f1-score   support

           0       0.55      0.72      0.63      2605
           1       0.80      0.66      0.72      4453

    accuracy                           0.68      7058
   macro avg       0.68      0.69      0.67      7058
weighted avg       0.71      0.68      0.69      7058
algorithm1R(df_test,"test_1R")
Name: Values:
Total: 1246
TP: 345
TN: 527
FP: 243
FN: 131
Accuracy: 0.6998394863563403
Precision: 0.5867346938775511
Recall: 0.7247899159663865
              precision    recall  f1-score   support

           0       0.59      0.72      0.65       476
           1       0.80      0.68      0.74       770

    accuracy                           0.70      1246
   macro avg       0.69      0.70      0.69      1246
weighted avg       0.72      0.70      0.70      1246
As we can see, our 1R algorithm works no miracles, but we reached an accuracy of about 70%, which we find quite decent given that it uses only a single rule. On the test sample we even achieved better accuracy, precision and recall than on the training one.
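The 0.5 cutoff above was read off the boxplot by eye. A proper 1R learner would instead pick the threshold with the lowest training error; the sweep below is a minimal sketch of that idea (our addition, using only columns already defined above):

# Sweep candidate thresholds on hbver and keep the one whose rule
# "hbver > t => healthy (indicator 0)" scores best on the training set.
thresholds = np.linspace(df_train['hbver'].min(), df_train['hbver'].max(), 101)
best_t, best_acc = None, 0.0
for t in thresholds:
    pred = np.where(df_train['hbver'] > t, 0.0, 1.0)
    acc = (pred == df_train['indicator']).mean()
    if acc > best_acc:
        best_t, best_acc = t, acc
print('Best threshold:', best_t, 'training accuracy:', best_acc)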
An algorithm using two rules (i.e., two columns). We chose hbver and leukocyty because they came out as the best attributes in the previous assignment. The rule is: if hbver is greater than 0.5 and leukocyty greater than 0.25, the patient is probably healthy. We read these thresholds off the scatterplot below.
sns.scatterplot(y="hbver", x="leukocyty", hue="indicator", data=df)
plt.show()
def algorithm2R(df, key):
    """Two-rule classifier: predict healthy (indicator 0) when hbver > 0.5 and leukocyty > 0.25."""
    global datas_from_classification
    datas_from_classification[key] = dict()
    d = datas_from_classification[key]
    d["ALL"] = len(df)
    healthy_rule = (df.hbver > 0.5) & (df.leukocyty > 0.25)
    d["TP"] = len(df.loc[healthy_rule & (df['indicator'] == 0.0)])
    # ~healthy_rule is the exact complement, so boundary values are not dropped.
    d["TN"] = len(df.loc[~healthy_rule & (df['indicator'] == 1.0)])
    d["FP"] = len(df.loc[healthy_rule & (df['indicator'] == 1.0)])
    d["FN"] = len(df.loc[~healthy_rule & (df['indicator'] == 0.0)])
    print_classf_data(key)
    d["VALS"] = np.where(healthy_rule, 0.0, 1.0)
    d["CR"] = classification_report(df['indicator'], d["VALS"], target_names=["0", "1"])
    print(d["CR"])
algorithm2R(df_train,"train_2R")
Name: Values:
Total: 7058
TP: 1850
TN: 3565
FP: 888
FN: 755
Accuracy: 0.7672145083593086
Precision: 0.6756756756756757
Recall: 0.710172744721689
              precision    recall  f1-score   support

           0       0.68      0.71      0.69      2605
           1       0.83      0.80      0.81      4453

    accuracy                           0.77      7058
   macro avg       0.75      0.76      0.75      7058
weighted avg       0.77      0.77      0.77      7058
algorithm2R(df_test,"test_2R")
Name: Values:
Total: 1246
TP: 337
TN: 637
FP: 133
FN: 139
Accuracy: 0.7817014446227929
Precision: 0.7170212765957447
Recall: 0.707983193277311
              precision    recall  f1-score   support

           0       0.72      0.71      0.71       476
           1       0.82      0.83      0.82       770

    accuracy                           0.78      1246
   macro avg       0.77      0.77      0.77      1246
weighted avg       0.78      0.78      0.78      1246
As we can see, the algorithm that uses two parameters achieves better results: accuracy is now around 78%. And just as with 1R, the test split again came out better in every metric.
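As a sanity check on the hand-picked thresholds (our addition, not part of the assignment), a depth-2 decision tree learns the same kind of axis-aligned cuts on the two columns automatically and can be compared against the 2R numbers:

# A depth-2 tree on the same two columns finds its own thresholds;
# comparable in spirit to the hand-written 2R rule.
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=2).fit(
    df_train[['hbver', 'leukocyty']], df_train['indicator'])
print('Tree test accuracy:', dt.score(df_test[['hbver', 'leukocyty']], df_test['indicator']))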
# Separate the target column and drop everything except the selected features.
train_output = df_train['indicator']
test_output = df_test['indicator']
columns = ['age','sex','race','hemoglobin','alt','alp','trombocyty','relationship','ast','weight','indicator']
df_train_class = df_train.drop(columns, axis=1)
df_test_class = df_test.drop(columns, axis=1)
From the whole dataset we kept only the columns that ANOVA ranked as the best in the previous assignment, namely hbver, leukocyty, blood_group, smoker, and so on.
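For reference, the ANOVA ranking from the previous assignment could be recomputed directly with scikit-learn; a minimal sketch, assuming all remaining columns are numeric:

# Hypothetical recomputation of the ANOVA F-scores with f_classif.
from sklearn.feature_selection import f_classif
f_scores, p_values = f_classif(df_train_class, train_output)
for name, score in sorted(zip(df_train_class.columns, f_scores), key=lambda x: -x[1]):
    print(name, round(score, 2))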
# Fit a random forest on the selected columns and evaluate on both splits.
model = RandomForestClassifier().fit(df_train_class, train_output)
predicted_labels = model.predict(df_test_class)
print('Training accuracy: ', model.score(df_train_class, train_output))
print('Test Accuracy: ', model.score(df_test_class, test_output))
Training accuracy:  1.0
Test Accuracy:  0.9502407704654896
We achieved quite good results on both the training and the test set. The training accuracy of 1.0 shows the forest fits the training data perfectly, yet the test accuracy stays around 0.95, so the overfitting is not severe.
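The forest's own feature importances give a quick cross-check of the ANOVA-based selection (our addition; feature_importances_ is a standard attribute of a fitted RandomForestClassifier):

# Impurity-based importances of the fitted forest, highest first.
for name, imp in sorted(zip(df_train_class.columns, model.feature_importances_), key=lambda x: -x[1]):
    print(name, round(imp, 3))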
# Export the second tree of the forest to Graphviz format and render it as a PNG.
export_graphviz(model.estimators_[1],
                out_file='tree.dot',
                feature_names=df_test_class.columns,
                class_names=["0", "1"],
                rounded=True, proportion=False,
                precision=2, filled=True)
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
Image(filename='tree.png')
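If Graphviz is not available, a similar picture can be drawn with matplotlib alone; a minimal sketch using sklearn's plot_tree, depth-limited so the figure stays readable:

# Graphviz-free alternative: render the same estimator with matplotlib.
from sklearn.tree import plot_tree
plt.figure(figsize=(20, 10))
plot_tree(model.estimators_[1], feature_names=list(df_test_class.columns),
          class_names=["0", "1"], filled=True, rounded=True, max_depth=3)
plt.show()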